{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "802043b7",
   "metadata": {},
   "source": [
    "### polars/pandas examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "54cd50bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip uninstall twitter-api-client -y\n",
    "# !pip install twitter-api-client --no-cache-dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a3172006",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import re\n",
    "from pathlib import Path\n",
    "\n",
    "import orjson\n",
    "import pandas as pd\n",
    "import polars as pl\n",
    "\n",
    "from twitter.util import find_key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4703bee3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def to_int(tdf: pl.LazyFrame, *args) -> pl.LazyFrame:\n",
    "    \"\"\"Cast every column named in `args` to Int64 and return the updated frame.\n",
    "\n",
    "    The cast is requested with `strict=False`, i.e. polars' non-strict casting mode.\n",
    "    \"\"\"\n",
    "    int_casts = [pl.col(name).cast(pl.Int64, strict=False).alias(name) for name in args]\n",
    "    return tdf.with_columns(int_casts)\n",
    "\n",
    "\n",
    "def to_dt(tdf: pl.LazyFrame, fmt: str, *args) -> pl.LazyFrame:\n",
    "    \"\"\"Parse every string column named in `args` into a Datetime using the strptime pattern `fmt`.\"\"\"\n",
    "    parsed = [pl.col(name).str.strptime(pl.Datetime, fmt).alias(name) for name in args]\n",
    "    return tdf.with_columns(parsed)\n",
    "\n",
    "\n",
    "def get_data(path: Path, expr: str = '', **kwargs) -> dict:\n",
    "    \"\"\"Load the JSON files under `path` whose names match `expr`, grouped by file-name suffix.\n",
    "\n",
    "    Args:\n",
    "        path: Directory that is searched recursively.\n",
    "        expr: Regular expression tested against each file name with `re.search`.\n",
    "            The default '' matches every name.\n",
    "        **kwargs: Forwarded to `re.search` (for example `flags=re.I`).\n",
    "\n",
    "    Returns:\n",
    "        A dict mapping the text after the last '_' in each file stem to the list of\n",
    "        payloads parsed from the files sharing that suffix.\n",
    "    \"\"\"\n",
    "    D = {}\n",
    "    for p in path.rglob('*'):\n",
    "        # rglob('*') also yields directories; reading one as bytes would raise,\n",
    "        # which the default expr='' (matches everything) would always trigger.\n",
    "        if p.is_file() and re.search(expr, p.name, **kwargs):\n",
    "            D.setdefault(p.stem.split('_')[-1], []).append(orjson.loads(p.read_bytes()))\n",
    "    return D"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b0addc33",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root directory that is searched recursively for raw JSON files.\n",
    "PATH = Path('data/raw')\n",
    "\n",
    "# Load only the files for users who favorited or retweeted a tweet.\n",
    "data = get_data(path=PATH, expr='Favoriters|Retweeters')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09efb374",
   "metadata": {},
   "source": [
    "### polars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e3a70d0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_user_details(data: dict, cols: list = None, sort: str = 'created_at') -> pl.LazyFrame:\n",
    "    \"\"\"Build a lazy frame with one row per user found under 'user_results' in `data`.\n",
    "\n",
    "    Args:\n",
    "        data: Parsed responses that are searched with `find_key`.\n",
    "        cols: Columns to keep, in the given order. When None or empty, every column is kept.\n",
    "        sort: Column to sort by. Note the convention used here: a value containing '-'\n",
    "            (e.g. '-created_at') sorts ascending, a value without '-' sorts descending.\n",
    "\n",
    "    Returns:\n",
    "        A LazyFrame de-duplicated on 'rest_id', with 'created_at' parsed as a datetime\n",
    "        and the count columns listed below cast to Int64.\n",
    "    \"\"\"\n",
    "    numeric = [\n",
    "        'fast_followers_count',\n",
    "        'favourites_count',\n",
    "        'followers_count',\n",
    "        'friends_count',\n",
    "        'listed_count',\n",
    "        'media_count',\n",
    "        'normal_followers_count',\n",
    "        'statuses_count',\n",
    "    ]\n",
    "\n",
    "    records = []\n",
    "    for entry in find_key(data, 'user_results'):\n",
    "        result = entry.get('result', {})\n",
    "        rest_id = result.get('rest_id')\n",
    "        # Entries without a non-empty 'legacy' payload carry no profile fields and are skipped.\n",
    "        if legacy := result.get('legacy', {}):\n",
    "            records.append({'rest_id': rest_id} | legacy)\n",
    "\n",
    "    users = (\n",
    "        pl.LazyFrame(records)\n",
    "        .unique(subset='rest_id')\n",
    "        .pipe(to_dt, '%a %b %d %H:%M:%S %z %Y', 'created_at')\n",
    "        .pipe(to_int, *numeric)\n",
    "        .sort(sort.strip('-'), descending='-' not in sort)\n",
    "    )\n",
    "    # Only project when columns were requested: passing None to select() does not\n",
    "    # keep all columns, which made the default cols=None unusable.\n",
    "    return users.select(cols) if cols else users"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "91495fc2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr > th,\n",
       ".dataframe > tbody > tr > td {\n",
       "  text-align: right;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (1855, 3)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>created_at</th><th>screen_name</th><th>followers_count</th></tr><tr><td>datetime[μs, +00:00]</td><td>str</td><td>i64</td></tr></thead><tbody><tr><td>2007-03-31 01:16:45 +00:00</td><td>&quot;TheLos&quot;</td><td>1601</td></tr><tr><td>2008-03-18 19:04:59 +00:00</td><td>&quot;wickedjava&quot;</td><td>2986</td></tr><tr><td>2008-04-17 17:30:21 +00:00</td><td>&quot;needless_input...</td><td>218</td></tr><tr><td>2008-06-27 08:58:13 +00:00</td><td>&quot;DebrisStorm&quot;</td><td>178</td></tr><tr><td>2008-07-26 21:58:07 +00:00</td><td>&quot;daka17&quot;</td><td>66</td></tr><tr><td>2008-09-03 23:27:25 +00:00</td><td>&quot;heyitsaaron&quot;</td><td>1230</td></tr><tr><td>2008-09-11 23:37:14 +00:00</td><td>&quot;marinamiss&quot;</td><td>771</td></tr><tr><td>2008-09-18 13:59:25 +00:00</td><td>&quot;shangrila79&quot;</td><td>229</td></tr><tr><td>2008-10-11 07:18:09 +00:00</td><td>&quot;fridayschild71...</td><td>183</td></tr><tr><td>2008-10-27 19:40:43 +00:00</td><td>&quot;Jacelendrahz&quot;</td><td>188</td></tr><tr><td>2008-11-06 21:50:56 +00:00</td><td>&quot;yolo_pinyato&quot;</td><td>2944</td></tr><tr><td>2008-12-05 07:33:23 +00:00</td><td>&quot;El_Dandy40&quot;</td><td>205</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>2023-02-06 15:48:26 +00:00</td><td>&quot;CosmicGhidorah...</td><td>11</td></tr><tr><td>2023-02-08 21:09:17 +00:00</td><td>&quot;backupfHell&quot;</td><td>14</td></tr><tr><td>2023-02-09 19:24:12 +00:00</td><td>&quot;KayFabulous80&quot;</td><td>144</td></tr><tr><td>2023-02-14 04:06:11 +00:00</td><td>&quot;HDBNGRClub&quot;</td><td>3</td></tr><tr><td>2023-02-16 18:38:48 +00:00</td><td>&quot;SladjaMilov14&quot;</td><td>1</td></tr><tr><td>2023-02-17 22:38:58 +00:00</td><td>&quot;c0pas27&quot;</td><td>53</td></tr><tr><td>2023-02-19 06:35:24 +00:00</td><td>&quot;B4NKSCLUB&quot;</td><td>13</td></tr><tr><td>2023-02-19 07:06:15 
+00:00</td><td>&quot;Later_Hayter&quot;</td><td>54</td></tr><tr><td>2023-02-21 06:47:49 +00:00</td><td>&quot;hart_kanya&quot;</td><td>2</td></tr><tr><td>2023-02-26 09:43:04 +00:00</td><td>&quot;_Val_Nichole&quot;</td><td>62</td></tr><tr><td>2023-03-04 23:50:32 +00:00</td><td>&quot;Chublosophy&quot;</td><td>346</td></tr><tr><td>2023-03-05 20:56:30 +00:00</td><td>&quot;Erron_20&quot;</td><td>8</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (1855, 3)\n",
       "┌────────────────────────────┬────────────────┬─────────────────┐\n",
       "│ created_at                 ┆ screen_name    ┆ followers_count │\n",
       "│ ---                        ┆ ---            ┆ ---             │\n",
       "│ datetime[μs, +00:00]       ┆ str            ┆ i64             │\n",
       "╞════════════════════════════╪════════════════╪═════════════════╡\n",
       "│ 2007-03-31 01:16:45 +00:00 ┆ TheLos         ┆ 1601            │\n",
       "│ 2008-03-18 19:04:59 +00:00 ┆ wickedjava     ┆ 2986            │\n",
       "│ 2008-04-17 17:30:21 +00:00 ┆ needless_input ┆ 218             │\n",
       "│ 2008-06-27 08:58:13 +00:00 ┆ DebrisStorm    ┆ 178             │\n",
       "│ …                          ┆ …              ┆ …               │\n",
       "│ 2023-02-21 06:47:49 +00:00 ┆ hart_kanya     ┆ 2               │\n",
       "│ 2023-02-26 09:43:04 +00:00 ┆ _Val_Nichole   ┆ 62              │\n",
       "│ 2023-03-04 23:50:32 +00:00 ┆ Chublosophy    ┆ 346             │\n",
       "│ 2023-03-05 20:56:30 +00:00 ┆ Erron_20       ┆ 8               │\n",
       "└────────────────────────────┴────────────────┴─────────────────┘"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# With sort='-created_at' the oldest accounts are listed first.\n",
    "lf = get_user_details(data=data, cols=['created_at', 'screen_name', 'followers_count'], sort='-created_at')\n",
    "\n",
    "lf.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "03aa8cc0",
   "metadata": {},
   "source": [
    "### pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4815e47f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_user_details2(data: dict, cols: list = None, sort: str = 'created_at') -> pd.DataFrame:\n",
    "    \"\"\"Pandas counterpart of `get_user_details`: one row per user found under 'user_results'.\n",
    "\n",
    "    Args:\n",
    "        data: Parsed responses that are searched with `find_key`.\n",
    "        cols: Columns to return. When None or empty, every column is returned.\n",
    "        sort: Column to sort by. A value containing '-' (e.g. '-created_at') sorts\n",
    "            ascending, a value without '-' sorts descending.\n",
    "\n",
    "    Returns:\n",
    "        A DataFrame de-duplicated on 'rest_id' with a fresh 0..n-1 index, 'created_at'\n",
    "        parsed as datetimes and every column whose name contains 'count' made numeric.\n",
    "    \"\"\"\n",
    "    records = []\n",
    "    for entry in find_key(data, 'user_results'):\n",
    "        result = entry.get('result', {})\n",
    "        rest_id = result.get('rest_id')\n",
    "        legacy = result.get('legacy', {})\n",
    "        # Entries without a non-empty 'legacy' payload are skipped.\n",
    "        if legacy:\n",
    "            records.append({'rest_id': rest_id} | legacy)\n",
    "\n",
    "    sort_col = sort.strip('-')\n",
    "    ascending = '-' in sort\n",
    "\n",
    "    users = pd.DataFrame(records).drop_duplicates('rest_id')\n",
    "    users = users.assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))\n",
    "    users = users.sort_values(sort_col, ascending=ascending).reset_index(drop=True)\n",
    "\n",
    "    # Values that cannot be parsed as numbers become NaN (errors='coerce').\n",
    "    count_cols = [name for name in users.columns if 'count' in name]\n",
    "    users[count_cols] = users[count_cols].apply(pd.to_numeric, errors='coerce')\n",
    "\n",
    "    if cols:\n",
    "        return users[cols]\n",
    "    return users"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "feb0251b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>created_at</th>\n",
       "      <th>screen_name</th>\n",
       "      <th>followers_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2007-03-31 01:16:45+00:00</td>\n",
       "      <td>TheLos</td>\n",
       "      <td>1601</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2008-03-18 19:04:59+00:00</td>\n",
       "      <td>wickedjava</td>\n",
       "      <td>2986</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2008-04-17 17:30:21+00:00</td>\n",
       "      <td>needless_input</td>\n",
       "      <td>218</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2008-06-27 08:58:13+00:00</td>\n",
       "      <td>DebrisStorm</td>\n",
       "      <td>178</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2008-07-26 21:58:07+00:00</td>\n",
       "      <td>daka17</td>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1850</th>\n",
       "      <td>2023-02-19 07:06:15+00:00</td>\n",
       "      <td>Later_Hayter</td>\n",
       "      <td>54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1851</th>\n",
       "      <td>2023-02-21 06:47:49+00:00</td>\n",
       "      <td>hart_kanya</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1852</th>\n",
       "      <td>2023-02-26 09:43:04+00:00</td>\n",
       "      <td>_Val_Nichole</td>\n",
       "      <td>62</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1853</th>\n",
       "      <td>2023-03-04 23:50:32+00:00</td>\n",
       "      <td>Chublosophy</td>\n",
       "      <td>346</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1854</th>\n",
       "      <td>2023-03-05 20:56:30+00:00</td>\n",
       "      <td>Erron_20</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1855 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                    created_at     screen_name  followers_count\n",
       "0    2007-03-31 01:16:45+00:00          TheLos             1601\n",
       "1    2008-03-18 19:04:59+00:00      wickedjava             2986\n",
       "2    2008-04-17 17:30:21+00:00  needless_input              218\n",
       "3    2008-06-27 08:58:13+00:00     DebrisStorm              178\n",
       "4    2008-07-26 21:58:07+00:00          daka17               66\n",
       "...                        ...             ...              ...\n",
       "1850 2023-02-19 07:06:15+00:00    Later_Hayter               54\n",
       "1851 2023-02-21 06:47:49+00:00      hart_kanya                2\n",
       "1852 2023-02-26 09:43:04+00:00    _Val_Nichole               62\n",
       "1853 2023-03-04 23:50:32+00:00     Chublosophy              346\n",
       "1854 2023-03-05 20:56:30+00:00        Erron_20                8\n",
       "\n",
       "[1855 rows x 3 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PATH = Path('data/raw')\n",
    "\n",
    "# Load only the files for users who favorited or retweeted a tweet.\n",
    "data = get_data(path=PATH, expr='Favoriters|Retweeters')\n",
    "\n",
    "df = get_user_details2(data=data, cols=['created_at', 'screen_name', 'followers_count'], sort='-created_at')\n",
    "\n",
    "df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
